# coding=utf-8
# Split each paragraph of the paper corpus at sentence-ending periods to build short sentences.
import random
import re

# Broad punctuation class (ASCII + full-width CJK): an alternation of escaped single
# characters. Unused in this file — presumably shared with other scripts;
# NOTE(review): confirm before removing.
pattern = r'\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|。|、|；|‘|’|【|】|·|！| |…|（|）'
# Sentence terminators used by process(): ASCII full stop "." or CJK full stop "。".
pattern2 = r'\.|。'


def process(corpus_folder, save_file, save_folder, sample_size=10000):
    """Sample lines from a corpus file and extract medium-length sentences.

    Each sampled line is split on sentence terminators (ASCII "." or CJK
    "。"); fragments whose length is strictly between 50 and 100 characters
    are re-terminated with "。" and appended to *save_file* in batches of
    roughly 1000 sentences.

    Args:
        corpus_folder: path of the input corpus file (one paragraph per line).
        save_file: path of the output file (opened in append mode).
        save_folder: unused; kept for backward compatibility with callers.
        sample_size: upper bound on the number of lines to sample
            (default 10000, matching the original hard-coded value).
    """
    # Same terminators as module-level pattern2; compiled once before the loop.
    splitter = re.compile(r'\.|。')
    corpus_list = []
    with open(corpus_folder, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    print("we have %d lines" % len(lines))

    i, x = 0, 0

    # Clamp the sample size: random.sample raises ValueError when the
    # population is smaller than the requested sample.
    for line in random.sample(lines, min(sample_size, len(lines))):
        # NOTE(review): the first fragment of each line is skipped ([1:]),
        # as in the original — confirm this matches the corpus layout.
        for sentence in splitter.split(line)[1:]:
            if 50 < len(sentence) < 100:
                i += 1
                corpus_list.append(sentence + "。\n")
        # Flush roughly every 1000 sentences. The original test
        # `i // 1000 == 1` stopped flushing once i reached 2000+.
        if i >= 1000:
            x += 1
            print("构建了 %d 个句子" % (x * 1000))
            with open(save_file, "a", encoding="utf-8") as fw:
                fw.writelines(corpus_list)
            i = 0
            corpus_list = []
    # Write whatever remains after the last full batch.
    with open(save_file, "a", encoding="utf-8") as fw:
        fw.writelines(corpus_list)


if __name__ == "__main__":
    corpus_folder = "/data/corpus_hj/1.txt"
    save_file = "/data/corpus_hj/save_random.txt"
    save_folder = "/home/cloudminds/Mywork/corpus/rephrase_corpus"
    process(corpus_folder, save_file, save_folder)
